Get issues from GitHub repositories and save them into a file. I'll follow this Kaggle competition data format because it's similar.
In [1]:
# Personal access token (GitHub), read from a local file so it never
# appears in the notebook itself.
with open("github.token", "r") as gt:
    # Strip surrounding whitespace: a trailing newline left by an editor
    # would otherwise become part of the credentials.
    TOKEN = gt.read().strip()
# GitHub user name the token belongs to.
USER = "aaossa"
In [2]:
from collections import namedtuple
# Repositories
# Note: a namedtuple [1] is a good fit "if you were going to create a
# bunch of instances of a class [...] and not change the
# attributes after you set them in __init__ [..]" [2]
# [1]: https://docs.python.org/3/library/collections.html#collections.namedtuple
# [2]: http://stackoverflow.com/a/9872434/3281097
Repository = namedtuple("Repository", ["owner", "repo"])

# One Repository per (owner, repo) pair, in the order listed here.
REPOS = [
    Repository(owner=owner, repo=repo)
    for owner, repo in (
        ("IIC2233-2015-1", "syllabus"),
        ("IIC2233-2015-2", "syllabus"),
        ("IIC2233-2016-1", "syllabus"),
        ("IIC2233-2016-02", "Syllabus"),
    )
]

# Issues: the fields kept for every issue.
Issue = namedtuple("Issue", ["number", "title", "body", "labels", "url"])
In [3]:
import re
from requests import Session
# Captures (group 1) the page number of the rel="last" entry of the "Link"
# response header used for pagination.
# - Raw string, so that "\d" is a regex escape and not an (invalid) string
#   escape.
# - "[?&]" anchors on the "page" query parameter itself, so "per_page=..."
#   is never mistaken for it.
# - "[^>]*" accepts any other query parameters after "page" (for example
#   "&state=all") without running past the end of the current <url>.
PAGE_REGEX = re.compile(r'[?&]page=(\d+)[^>]*>; rel="last"')
# Base URL that every API endpoint is built from.
ROOT = "https://api.github.com"
# Single shared session, used as the default by the request helpers.
SESSION = Session()
# Credentials sent with every request made through the session:
# the GitHub user name and the personal access token.
SESSION.auth = (USER, TOKEN)
In [4]:
def get_issues_for_repository(repository, session=SESSION):
    """Return every issue of a repository, pull requests excluded.

    Requests the first page of the "list issues" endpoint, reads the number
    of the last page from the ``Link`` response header and then requests
    the remaining pages one by one.

    Args:
        repository (Repository): Owner and name of the repository.
        session (requests.Session, optional): Session used to send the
            requests to the API. Defaults to the module-level ``SESSION``.

    Returns:
        list: One ``Issue`` (number, title, body, labels, url) per issue,
        in the order the API returned them.

    Raises:
        requests.HTTPError: If the API answers any request with an error
            status code.
    """
    # List issues for a repository [1]
    # [1]: https://developer.github.com/v3/issues/#list-issues-for-a-repository
    endpoint_url = "{root}/repos/{owner}/{repo}/issues".format(
        root=ROOT, owner=repository.owner, repo=repository.repo)

    def fetch_page(page):
        # Request one page of issues in any state; fail loudly on an error
        # response instead of handing an error payload to process_issues.
        response = session.get(endpoint_url, params={"page": page, "state": "all"})
        response.raise_for_status()
        return response

    # Traversing with pagination [2]
    # [2]: https://developer.github.com/guides/traversing-with-pagination/#basics-of-pagination
    first_response = fetch_page(1)
    # The Link header is absent when all results fit in a single page;
    # default to "" so the regex search does not receive None.
    last_page = PAGE_REGEX.search(first_response.headers.get("link", ""))
    number_of_pages = 1 if last_page is None else int(last_page.group(1))

    issues = list(process_issues(first_response.json()))
    for page in range(2, number_of_pages + 1):
        issues.extend(process_issues(fetch_page(page).json()))
    return issues
def process_issues(issues_page):
    """Convert one page of API results into ``Issue`` tuples, skipping PRs.

    Args:
        issues_page (list): List taken from the JSON response of the API,
            one dictionary per entry.

    Returns:
        map: Lazy iterator producing an ``Issue`` for every entry that has
        no "pull_request" key.
    """
    def is_not_pull_request(entry):
        # Entries carrying a "pull_request" key are pull requests (PR).
        return "pull_request" not in entry.keys()

    def to_issue(entry):
        # Keep only the name of each label.
        label_names = [label.get("name") for label in entry.get("labels")]
        return Issue(
            number=entry.get("number"),
            title=entry.get("title"),
            body=entry.get("body"),
            labels=label_names,
            url=entry.get("url"),
        )

    return map(to_issue, filter(is_not_pull_request, issues_page))
Using aaossa/aaossa.github.io
In [5]:
# Display the documentation of get_issues_for_repository.
# TODO: Use PEP257 [1]
# [1]: https://www.python.org/dev/peps/pep-0257/
help(get_issues_for_repository)
In [6]:
# Display the documentation of process_issues.
# TODO: Use PEP257
help(process_issues)
In [7]:
# Retrieving issues from a repository
issues_repo_demo = get_issues_for_repository(
    Repository(owner="aaossa", repo="aaossa.github.io")
)
# One summary line per issue; str.format ignores the fields of the Issue
# that the template does not mention.
for issue in issues_repo_demo:
    print("#{number} - {title}".format(**issue._asdict()))
In [8]:
# Each Issue object contains only the relevant information
issue_demo = issues_repo_demo[0]
output = """\
#{number} - {title}
[{labels}]
{body}
Link: {url}"""
# Use the fields of the Issue as format arguments, replacing the list of
# labels with a comma-separated string.
print(output.format(**dict(
    issue_demo._asdict(),
    labels=", ".join(issue_demo.labels),
)))
In [ ]: